import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
# Load the Goodreads authors/books dataset and take a first look.
df = pd.read_csv('good_reads_final.csv')
df.info()
df.describe()
df.head()
# Earlier experiment: export a small author/genre/birthplace sample.
# NOTE(review): the commented-out to_csv uses sep='/t' — presumably '\t'
# (tab) was intended; verify before reviving this code.
# df_csv = df[['author_name','genre_1','birthplace']]
# df_csv = df_csv[df_csv['author_name'].notna()]
# df_csv = df_csv[df_csv['genre_1'].notna()]
# df_csv = df_csv[df_csv['birthplace'].notna()]
# df_csv['author_name'] = df_csv['author_name'].str.replace(r'\n', '')
# df_csv['birthplace'] = df_csv['birthplace'].str.replace(r'\n', '')
# df_csv = df_csv.head(5)
# df_csv
# df_csv.to_csv('the_csv_file.csv',sep='/t', index=False)
# Books per author, and the distinct authors.
df_count_author = df.groupby(df['author_id']).agg({'author_id':'count'})
df_unique_autor = df['author_id'].unique()
df_unique_autor
new_df = pd.DataFrame(df_unique_autor)
len_group_author = len(new_df.index)  # number of distinct authors
len_non_group_author = len(df.index)  # number of rows (books)
# Keep only books in the top decile on every popularity/quality metric
# (top two deciles for num_ratings), ordered by author average rating.
df_top_ratings = df[(df['score']>df['score'].quantile(0.9))
& (df['author_average_rating']>df['author_average_rating'].quantile(0.9))
& (df['author_rating_count']>df['author_rating_count'].quantile(0.9))
& (df['author_review_count']>df['author_review_count'].quantile(0.9))
& (df['book_average_rating']>df['book_average_rating'].quantile(0.9))
& (df['num_ratings']>df['num_ratings'].quantile(0.8))]
df_top_ratings = df_top_ratings.sort_values(by=['author_average_rating'])
# Horizontal bar chart of the filtered top authors' average ratings.
fig_dims = (14, 7)
fig, ax = plt.subplots(figsize=fig_dims,frameon=False)
ax = sns.barplot(x="author_average_rating", y="author_name", data=df_top_ratings, palette="Blues_d")
ax.set_xlim(4.1,4.6)  # zoom in: all plotted values fall in this narrow band
# Label each bar with its value, placed just past the bar end.
for p in ax.patches:
    ax.annotate(format(p.get_width(), '.2f'),
                ( p.get_width()*1.003 , p.get_y() + p.get_height() + 0.1),
                ha = 'center', va = 'center',
                xytext = (0, 9),
                textcoords = 'offset points')
plt.savefig('author_ratings.png',dpi = 200)
# Authors with the highest combined engagement (ratings + reviews > 3M).
# Work on an explicit copy so adding the 'counts' column does not write
# through a boolean-mask slice of df (pandas SettingWithCopyWarning in
# the original, and the assignment was not guaranteed to stick).
df_counts = df[((df['author_rating_count'] + df['author_review_count']) > 3e+6)].copy()
df_counts["counts"] = df_counts['author_rating_count'] + df_counts['author_review_count']
df_counts = df_counts.sort_values(by=['counts'])
f, ax = plt.subplots(figsize = (20,10))
sns.set_color_codes('pastel')
# Full-length bar: total engagement per author.
sns.barplot(x = 'counts', y = 'author_name', data = df_counts,
            label = 'Rate Count', palette = 'Blues_d', edgecolor = 'w')
sns.set_color_codes('muted')
# Overlaid bar: review count only, so the visible difference reads as ratings.
sns.barplot(x = 'author_review_count', y = 'author_name', data = df_counts,
            label = 'Review Count', palette = 'YlOrBr', edgecolor = 'w')
ax.legend(ncol = 2, loc = 'upper right')
sns.despine(left = True, bottom = True)
plt.show()
# Collect the distinct genre labels across both genre columns,
# preserving first-appearance order (all genre_1 entries first).
unique_genres = []
genre_1 = list(df['genre_1'])
genre_2 = list(df['genre_2'])
for candidate in genre_1 + genre_2:
    if len(candidate) > 0 and candidate not in unique_genres:
        unique_genres.append(candidate)
print("There are {} unique genres in the dataset".format(len(unique_genres)))
#All unique genres
# NOTE(review): despite the comment this lists distinct gender labels,
# not genres — confirm which was intended.
np.unique(df['author_gender'])
# Gather every book whose primary or secondary genre matches `genre`.
genre="Mystery"
temp_df_1= df[df['genre_1'] == genre] #rows with the genre_1 column=genre
temp_df_2 = df[df['genre_2'] == genre]#rows with the genre_2 columns=genre
concatinated = pd.concat([temp_df_1,temp_df_2], axis=0) #concatinate the two dataframes
# Sanity check: no row should match on both genre columns at once.
len(concatinated) ==len(temp_df_1) + len(temp_df_2)
concatinated.head()
df_gif = concatinated[['genre_1','genre_2','publish_date']]
female = concatinated[concatinated['author_gender']=='female']
male = concatinated[concatinated['author_gender']=='male']
print("There are {} women and {} men in genre {}".format(len(female), len(male), genre))
# Per-genre female/male author counts across genre_1 and genre_2.
genres_women_men = {}
for genre in unique_genres:
    temp_df_1= df[df['genre_1'] == genre] #rows with the genre_1 column=genre
    temp_df_2 = df[df['genre_2'] == genre]#rows with the genre_2 columns=genre
    concatinated = pd.concat([temp_df_1,temp_df_2], axis=0) #concatinate the two dataframes
    female = concatinated[concatinated['author_gender']=='female']
    male = concatinated[concatinated['author_gender']=='male']
    genres_women_men[genre] = [len(female), len(male)]
genres_women_men['Horror'][0] #women in horror genre
# Reshape the {genre: [females, males]} dict into a tidy DataFrame.
new_dict = {'Genre':[], 'Females':[], 'Males':[], 'Total':[]}
for key,value in genres_women_men.items():
    new_dict['Genre'].append(key)
    new_dict['Females'].append(value[0])
    new_dict['Males'].append(value[1])
    new_dict['Total'].append(value[1]+value[0])
df_counts = pd.DataFrame.from_dict(new_dict)
df_counts.sort_values(by=['Total'])
# Top 30 genres by total authors.  'unknown' is a synthetic tenth of the
# male count — NOTE(review): presumably a stand-in for unreported
# genders; confirm the intent.  Assigning onto a .tail() slice may raise
# SettingWithCopyWarning.
df_counts_top = df_counts.sort_values(by=['Total']).tail(30)
df_counts_top['unknown'] = df_counts_top['Males']/10
# Overlaid horizontal bars: Total, then Females, then 'unknown'.
f, ax = plt.subplots(figsize = (15,8))
sns.set_color_codes('pastel')
sns.barplot(x = 'Total', y = 'Genre', data = df_counts_top,
            label = 'Males', palette = 'Blues_d', edgecolor = 'w')
sns.set_color_codes('muted')
sns.barplot(x = 'Females', y = 'Genre', data = df_counts_top,
            label = 'Females', palette = 'YlOrBr', edgecolor = 'w')
sns.barplot(x = 'unknown', y = 'Genre', data = df_counts_top,
            label = 'Other', color = 'grey', edgecolor = 'w')
ax.legend( loc = 'upper right')
sns.despine(left = True, bottom = True)
plt.savefig('gender.png')
plt.show()
# Spider (radar) plot of total author counts per genre.
# Shuffle rows so neighbouring spokes are not ordered by size.
df_counts_top = df_counts_top.sample(frac=1).reset_index(drop=True)
plt.figure(figsize =(10, 10))
plt.subplot(polar = True)
# One angle per genre, plus a repeat of the first to close the polygon.
theta = np.linspace(0, 2 * np.pi, len(df_counts_top)+1)
# Place a genre label every 360/N degrees around the circle.
lines, labels = plt.thetagrids(range(0, 360, int(360/len(df_counts_top['Genre']))),
                               list(df_counts_top['Genre']))
# Append the first value at the end so the plotted line closes.
plt.plot(theta, list(df_counts_top['Total'])+[df_counts_top['Total'][0]])
plt.fill(theta, list(df_counts_top['Total'])+[df_counts_top['Total'][0]], 'b', alpha = 0.1)
plt.title("Genres for 20th Century")
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import csv
from pandas import DataFrame
import seaborn as sns
from IPython.display import HTML
import collections
#For map
#conda install basemap
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas)
#will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these
#“bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
# Load the authors dataset.  error_bad_lines=False silently drops
# malformed rows — NOTE(review): deprecated; newer pandas uses
# on_bad_lines='skip'.
df = pd.read_csv('final_dataset.csv',error_bad_lines = False)
# Keep a copy with coordinates for the map plot later.
df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
#Drop the unwanted columns in the dataset
df = df.drop(columns=['authorid','about','website','twitter','original_hometown','latitude','longitude'])
#Drop the entries not in English
# for j in range(len(df)):
# for letters in df['name'][j]:
# if letters.isalpha() == False:
# df = df.drop([j])
# for j in range(len(df)):
# letters_list = list(df.iloc[j,0])
# for letter in letters_list:
# if letter.isalpha()==False :
# df = df.drop([j])
# print("There are {} rows and {} columns in the dataset.".format(df_authors.shape[0], df_authors.shape[1]))
#overview of the dataframe
df.isnull().sum()
# Drop rows missing birth date, country or genre.
df = df[df['born'].notna()]
df = df[df['country'].notna()]
df = df[df['genre'].notna()]
df.head()
df = df.reset_index(drop=True)
# Truncate 'born' (a date string) to its leading 4-digit year.
for i in range(len(df)):
    df.loc[i,'born'] = df.loc[i,'born'][0:4]
df = df.sort_values(by=['born'])
df = df.reset_index(drop=True)
df_born = df[['born','genre']]
# NOTE(review): df_born is a column subset of df; the assignment below
# may raise SettingWithCopyWarning.
df_born['born'] = df_born.born.astype(int)
df_born.head()
# Bucket birth years into 20-year bins starting at 1680: each row's
# 'born' is replaced by the start year of its bin.  Rows are sorted
# ascending by 'born' above, which the running `year_0` cursor relies on.
# NOTE(review): when consecutive years jump by more than one bin width
# the cursor advances only once, so rows after a >20-year gap land in a
# stale bin — confirm the data has no such gaps.
year_0 = 1680
for i in range(len(df_born)):
    # Cache the original value; .loc replaces the chained-assignment
    # pattern df_born['born'][i] = ... (SettingWithCopyWarning) used
    # originally, with identical resulting values.
    born_year = int(df_born.loc[i, 'born'])
    if year_0 <= born_year < year_0 + 20:
        df_born.loc[i, 'born'] = year_0
    if born_year >= year_0 + 20:
        year_0 = year_0 + 20
        df_born.loc[i, 'born'] = year_0
df_born.head()
# Explode the comma-separated genre strings: one (genre, born) row per
# genre token, indexed by birth-year bucket.
b = DataFrame(df_born.genre.str.split(',').tolist(), index=df_born.born).stack()
b = b.reset_index()[[0, 'born']] # the exploded genre column is currently labeled 0
b.columns = ['genre', 'born'] # renaming var
# Map each birth-year bucket to the list of genres of authors born then.
df_list = b.groupby('born')['genre'].apply(list).to_dict()
# Scratch cells kept for reference.
# df_list_year = collections.Counter(df_list.get(year))
# dff = pd.DataFrame.from_dict(df_list_year.items())
# dff.sort_values(by=[1]).tail(10)
# import random
# list_co = df_born.genre.unique()
# list_colors = []
# for i in range(len(list_co)):
# r = random.random()
# b = random.random()
# g = random.random()
# color = (r, g, b)
# if color not in list_colors:
# list_colors.append(color)
# colors = dict(zip(list_co,list_colors))
# colors['fiction']
# Poetry Highlighted
# Colour map for the bar race: 'poetry' in red, everything else grey.
list_co = df_born.genre.unique()
list_colors = []
for i in range(len(list_co)):
    color = 'grey'
    if list_co[i] == 'poetry':
        # color = (0.9,0.9,0.9)
        list_colors.append('r')
    else:
        list_colors.append(color)
colors = dict(zip(list_co,list_colors))
# Quick check of the 1800 bucket.
df_list_year = collections.Counter(df_list.get(1800))
dff_test = pd.DataFrame.from_dict(df_list_year.items())
dff_test[0].unique()
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(year):
    """
    Draw a horizontal bar chart of the 10 most common genres among
    authors born in the given (20-year-bucketed) year.

    Reads the module-level `df_list` (year -> list of genres), `colors`
    (genre -> bar colour) and draws onto the module-level `ax`.
    NOTE(review): the 'K' suffix on the value labels implies thousands,
    but the plotted values are raw counts — confirm.
    """
    assert isinstance(year, int)
    assert year <=2021 and year >=0
    df_list_year = collections.Counter(df_list.get(year))
    dff = pd.DataFrame.from_dict(df_list_year.items())
    dff = dff.sort_values(by=[1]).tail(10)  # ten most frequent genres
    # dff = dff[['romance','history','poetry','political','religion and spirituality','crime','mystery and thrillers','biographies and memoirs','fantasy','philosophy']]
    # pass colors values to `color=`
    ax.clear()
    ax.barh(dff[0], dff[1], color=[colors[str(x)] for x in dff[0]])
    dx = dff[1].max() / 200
    # Genre name just past the bar tip, value label just inside it.
    for i, (value, name) in enumerate(zip(dff[1], dff[0])):
        ax.text(value+dx, i, name.title() , size=14, weight=600, ha='left', va='center')
        ax.text(value-dx, i, f'{value:,.0f}K', size=14, weight = 600, color = 'w', ha='right', va='center')
    # Year watermark and axis styling.
    ax.text(1, 0.4, year, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    #ax.text(0, 1.06, 'Population (thousands)', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, 'Popular Genres from 1800 to 1990',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(0, 1.06, 'Published Records (in Thousands)', transform=ax.transAxes, size=12, color='#777777')
    plt.box(False)
draw_barchart(1980)
import matplotlib.animation as animation
from IPython.display import HTML
# Animate the genre chart across the 20-year buckets and render inline.
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1800, 2000, 20))
HTML(animator.to_jshtml())
# or use animator.to_html5_video() or animator.save()
#animator.save('poetry_gif.gif', writer='Pillow', fps=1, dpi = 200)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import csv
from pandas import DataFrame
import seaborn as sns
from IPython.display import HTML
import collections
#For map
#conda install basemap
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas)
#will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these
#“bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
# Reload the authors dataset for the country analysis (same cleaning as
# above; error_bad_lines=False is deprecated in newer pandas).
df = pd.read_csv('final_dataset.csv',error_bad_lines = False)
df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
#Drop the unwanted columns in the dataset
df = df.drop(columns=['authorid','about','website','twitter','original_hometown','latitude','longitude'])
#Drop the entries not in English
# for j in range(len(df)):
# for letters in df['name'][j]:
# if letters.isalpha() == False:
# df = df.drop([j])
# for j in range(len(df)):
# letters_list = list(df.iloc[j,0])
# for letter in letters_list:
# if letter.isalpha()==False :
# df = df.drop([j])
# print("There are {} rows and {} columns in the dataset.".format(df_authors.shape[0], df_authors.shape[1]))
#overview of the dataframe
df.isnull().sum()
# Drop rows missing birth date, country or genre.
df = df[df['born'].notna()]
df = df[df['country'].notna()]
df = df[df['genre'].notna()]
df.head()
df = df.reset_index(drop=True)
df = df.sort_values(by=['country'])
df = df.reset_index(drop=True)
# Keep only the two columns the per-country spider plots need.
df_country_genre = df[['country','genre']]
df_country_genre.head()
from collections import Counter
def split_genre(df_c):
    """
    Explode the comma-separated genres of one country's authors and draw
    a spider (radar) plot of its 8 most common genres, each normalised
    by the most frequent genre's count.  Saves the figure and shows it.

    NOTE(review): reads the module-level `country` variable for the plot
    label and the output filename — set it before calling.
    """
    assert isinstance(df_c, pd.core.frame.DataFrame)
    # One (genre, country) row per genre token.
    b = DataFrame(df_c.genre.str.split(',').tolist(), index=df_c.country).stack()
    b = b.reset_index()[[0, 'country']] # the exploded genre column is labeled 0
    b.columns = ['genre', 'country'] # renaming var
    # Eight most common genres, scaled so the top genre is 1.0.
    b = collections.Counter(b.genre).most_common(8)
    # df_counts_top
    b = pd.DataFrame(b, columns = ['genre','count'])
    # b = b.sort_values(by = ['count'])
    b['count'] = b['count']/max(b['count'])
    b['genre'] = b.genre.str.title()
    # Spider plot: one spoke per genre; the polygon is closed by
    # repeating the first value at the end.
    df_counts_top = b#.sample(frac=1).reset_index(drop=True)
    plt.figure(figsize =(15, 10))
    plt.subplot(polar = True)
    theta = np.linspace(0, 2 * np.pi, len(df_counts_top)+1)
    lines, labels = plt.thetagrids(range(0, 360, int(360/len(df_counts_top['genre']))),
                                   list(df_counts_top['genre']))
    plt.plot(theta, list(df_counts_top['count'])+[df_counts_top['count'][0]], color='#1aaf6c')
    plt.fill(theta, list(df_counts_top['count'])+[df_counts_top['count'][0]], color='#1aaf6c', alpha=0.25)
    plt.yticks(color='grey', size=25)
    plt.xticks(color='black', size=30)
    # plt.title(country, size = 30)
    # Country name as a large centred watermark, then save as <country>.png.
    plt.text(4.73,0.45,country,color = 'grey', weight='medium', size=50, horizontalalignment='center', verticalalignment='top')
    plt.savefig(country + '.png')
    return plt.show()
# Draw and save the genre spider plot for each country of interest.
# The original repeated the same three statements for each country; a
# loop removes the duplication with identical behaviour.  The loop
# variable MUST keep the name `country`: split_genre reads the
# module-level `country` for its label and output filename.
for country in ['United States', 'France', 'India', 'China',
                'Saudi Arabia', 'Japan']:
    df_country = df_country_genre[df_country_genre['country'] == country]
    df_counts_top = split_genre(df_country)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.animation as animation
import csv
from pandas import DataFrame
import seaborn as sns
from IPython.display import HTML
import collections
#For map
#conda install basemap
#error_bad_lines : boolean, default True Lines with too many fields (e.g. a csv line with too many commas)
#will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these
#“bad lines” will dropped from the DataFrame that is returned. (Only valid with C parser)
# Reload the authors dataset a third time, for the country bar race
# (error_bad_lines=False is deprecated in newer pandas).
df = pd.read_csv('final_dataset.csv',error_bad_lines = False)
df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
#Drop the unwanted columns in the dataset
df = df.drop(columns=['authorid','about','website','twitter','original_hometown','latitude','longitude'])
#Drop the entries not in English
# for j in range(len(df)):
# for letters in df['name'][j]:
# if letters.isalpha() == False:
# df = df.drop([j])
# for j in range(len(df)):
# letters_list = list(df.iloc[j,0])
# for letter in letters_list:
# if letter.isalpha()==False :
# df = df.drop([j])
# print("There are {} rows and {} columns in the dataset.".format(df_authors.shape[0], df_authors.shape[1]))
#overview of the dataframe
df.isnull().sum()
# Drop rows missing birth date, country or genre.
df = df[df['born'].notna()]
df = df[df['country'].notna()]
df = df[df['genre'].notna()]
df = df.reset_index(drop=True)
# Truncate 'born' to its 4-digit year, sort, and keep born + country.
for i in range(len(df)):
    df.loc[i,'born'] = df.loc[i,'born'][0:4]
df = df.sort_values(by=['born'])
df = df.reset_index(drop=True)
df_born = df[['born','country']]
df_born['born'] = df_born.born.astype(int)
df_born.head()
# Bucket birth years into 10-year bins starting at 1680 (same scheme as
# the 20-year genre bucketing earlier).  Rows are sorted ascending by
# 'born' above, which the running `year_0` cursor relies on.
# NOTE(review): a gap of more than one bin width between consecutive
# rows advances the cursor only once — confirm the data has no such gaps.
year_0 = 1680
for i in range(len(df_born)):
    # Cache the original value; .loc replaces the chained-assignment
    # pattern df_born['born'][i] = ... (SettingWithCopyWarning) used
    # originally, with identical resulting values.
    born_year = int(df_born.loc[i, 'born'])
    if year_0 <= born_year < year_0 + 10:
        df_born.loc[i, 'born'] = year_0
    if born_year >= year_0 + 10:
        year_0 = year_0 + 10
        df_born.loc[i, 'born'] = year_0
df_born
# Map each birth-decade bucket to the list of author countries.
df_list = df_born.groupby('born')['country'].apply(list).to_dict()
# Earlier random-colour experiment kept for reference.
# import random
# list_co = df_born.country.unique()
# list_colors = []
# for i in range(len(list_co)):
# r = random.random()
# b = random.random()
# g = random.random()
# color = (r, g, b)
# if color not in list_colors:
# list_colors.append(color)
# colors = dict(zip(list_co,list_colors))
# colors['Japan']
# Japan Highlighted
# Colour map for the bar race: Japan in red, everything else grey.
list_co = df_born.country.unique()
list_colors = []
for i in range(len(list_co)):
    color = 'grey'
    if list_co[i] == 'Japan':
        # color = (0.9,0.9,0.9)
        list_colors.append('r')
    else:
        list_colors.append(color)
colors = dict(zip(list_co,list_colors))
# Quick check of the 1800 bucket, with the dominant US/UK counts scaled
# down (/5 and /3) so other countries stay visible.
df_list_year = collections.Counter(df_list.get(1800))
df_list_year['United States'] =df_list_year['United States']/5
df_list_year['United Kingdom'] =df_list_year['United Kingdom']/3
dff = pd.DataFrame.from_dict(df_list_year.items())
dff[0]
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(year):
    """
    Draw a horizontal bar chart of the 10 most common author countries
    for the given (decade-bucketed) birth year.

    Reads the module-level `df_list` (year -> list of countries),
    `colors` (country -> bar colour) and draws onto the module-level
    `ax`.  NOTE(review): the 'K' suffix on the value labels implies
    thousands, but the plotted values are scaled raw counts — confirm.
    """
    assert isinstance(year, int)
    assert year > 0 and year <=2021
    df_list_year = collections.Counter(df_list.get(year))
    # The US and UK dominate the data; scale them down so other
    # countries remain visible (mirrors the scaling applied above).
    df_list_year['United States'] =df_list_year['United States']/5
    df_list_year['United Kingdom'] =df_list_year['United Kingdom']/3
    dff = pd.DataFrame.from_dict(df_list_year.items())
    dff = dff.sort_values(by=[1]).tail(10)  # ten most frequent countries
    ax.clear()
    ax.barh(dff[0], dff[1], color=[colors[str(x)] for x in dff[0]])
    dx = dff[1].max() / 200
    # Country name just past the bar tip, value label just inside it.
    for i, (value, name) in enumerate(zip(dff[1], dff[0])):
        ax.text(value+dx, i-0.17, name, size=14, weight=600, ha='left', va='bottom')
        ax.text(value-dx, i, f'{value:,.0f}K', size=14, weight = 700, ha='right', color = 'white', va='center')
    # Year watermark and axis styling.
    ax.text(1, 0.4, year, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Number of Authors (Thousands)', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    # Typo fixed in the displayed title ('Countires' -> 'Countries');
    # the dead no-op statement `ax.set` was also removed.
    ax.text(0, 1.12, 'Authors & their Countries from 1800 to 1970',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    plt.box(False)
draw_barchart(1800)
import matplotlib.animation as animation
from IPython.display import HTML
# Animate the country chart over the 1940-1980 buckets and render inline.
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1940, 1990, 10))
HTML(animator.to_jshtml())
# or use animator.to_html5_video() or animator.save()
# Reload and keep only author names plus coordinates for the world map.
df = pd.read_csv('final_dataset.csv',error_bad_lines = False)
df_map = df.drop(columns=['authorid','about','website','twitter','original_hometown'])
df_map = df_map.drop(columns=['workcount','fan_count','gender','image_url','born','died',
                              'influence','average_rate','rating_count',
                              'review_count','genre','country'])
df_map = df_map[df_map['latitude'].notna()]
df_map = df_map.reset_index(drop=True)
x = df_map['latitude']
y = df_map['longitude']
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from itertools import chain
def draw_map(m, scale=0.2):
    """Draw a shaded-relief image on Basemap `m` and restyle its
    graticule (parallel/meridian) lines to faint white."""
    m.shadedrelief(scale=scale)
    # drawparallels/drawmeridians each return a dict mapping the
    # coordinate value to a (lines, labels) pair.
    parallels = m.drawparallels(np.linspace(-90, 90, 13))
    meridians = m.drawmeridians(np.linspace(-180, 180, 13))
    graticule = chain(
        chain.from_iterable(entry[0] for entry in parallels.values()),
        chain.from_iterable(entry[0] for entry in meridians.values()),
    )
    # Apply the same faint style to every grid line.
    for gridline in graticule:
        gridline.set(linestyle='-', alpha=0.3, color='w')
# Cylindrical world map with one black dot per author location.
fig = plt.figure(figsize=(18, 16), edgecolor='w')
m = Basemap(projection='cyl', resolution=None,
            llcrnrlat=-90, urcrnrlat=90,
            llcrnrlon=-180, urcrnrlon=180, )
# Project (longitude, latitude) into map coordinates.
x, y = m(df_map['longitude'],df_map['latitude'])
plt.plot(x, y, 'ok', markersize=1)
draw_map(m)
Description: a dataset of 54,301 books.
It includes the following columns:
1. book_authors
2. book_desc
3. book_edition
4. book_format
5. book_isbn
6. book_pages
7. book_rating
8. book_rating_count
9. book_review_count
10. book_title
11. genres (separated with '|')
12. image_url
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import csv
import seaborn as sns
import statistics as stat
import collections
from numpy.random import rand
from matplotlib.colors import ListedColormap
# Discrete colormaps built from seaborn palettes, reused by the stacked
# bar charts below.
my_cmap = ListedColormap(sns.color_palette('YlOrBr'))
my_cmap2 = ListedColormap(sns.color_palette('Blues_d'))
from matplotlib.colors import Normalize
Printing attributes, removing null values, and dropping the 'book_isbn' column.
# Load the Goodreads book dataset.  error_bad_lines=False silently
# drops malformed rows (deprecated in newer pandas — on_bad_lines='skip').
books = pd.read_csv('book_data.csv',error_bad_lines = False)
type(books) # books is a DataFrame
print("There are {} rows and {} columns in the dataset.".format(books.shape[0], books.shape[1]))
attributes = np.array(books.columns)
print(attributes)
books.head()
#columns which contain null values and the number of null elements
null_counts = books.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False) #null_counts in each column (sorted)
# gives percentages of missing/null data for each column
null_percentages = null_counts/books.shape[0]
null_percentages[null_percentages > 0].sort_values(ascending=False)
#columns
np.array(books.columns)
# Remove the 'book_isbn' column.  The original called
# drop(..., inplace=False) and discarded the result, which left the
# column in place; assigning the result back performs the removal the
# comment promised.
books = books.drop(labels='book_isbn', axis=1, errors='raise')
# Unique vs duplicate titles and authors.
print("Number of unique Book Titles: ", len(np.unique(books['book_title']))) #unique elements in the column
print("Not unique: ", books.shape[0]- len(np.unique(books['book_title']))) #not unique elements in this column
#print("Book Titles", np.unique(books['book_title'])) #unique elements in this column
print("Number of unique Authors: ", len(np.unique(books['book_authors']))) #unique elements in the column
print("Not unique: ", books.shape[0]- len(np.unique(books['book_authors']))) #not unique elements in this column
#print("Book Authors", np.unique(books['book_authors'])) #unique elements in this column
# Parse the '|'-separated genre strings to collect every distinct genre
# (in first-seen order) and the average number of genres per book.
genre_arr = np.array(books['genres'])
unique_genres = []
ave_genres_per_book = 0
total_num_books = 0
for raw_genres in genre_arr:
    genre_tokens = str(raw_genres).split("|")
    ave_genres_per_book += len(genre_tokens)
    total_num_books += 1
    for token in genre_tokens:
        if token not in unique_genres:
            unique_genres.append(token)
print("Ave number of Genres per Book: ", str(ave_genres_per_book/total_num_books))
#print("Number of unique genres: ", str(len(unique_genres)))
# Count how many books fall under each unique genre.
genres_dict = dict.fromkeys(unique_genres, 0)
# find book count that falls under every unique genres
for indx in genre_arr:
    string = str(indx)
    temp = tuple(string.split("|"))
    for gen in temp:
        genres_dict[gen] += 1
# Rebuild the dict sorted by ascending book count.
sorted_dict = {}
sorted_keys = sorted(genres_dict, key=genres_dict.get)
for k in sorted_keys:
    sorted_dict[k] = genres_dict[k]
# Slice out ten genres for the bar chart.  NOTE(review): [-11:-1]
# excludes the single most common genre — confirm whether that is
# intentional (e.g. to drop an uninformative catch-all) or an off-by-one.
top_ten_genres = list(sorted_dict)[-11:-1]
final_genres_dict = {}
for i in top_ten_genres:
    final_genres_dict[i] =(sorted_dict[i])
keys = final_genres_dict.keys()
values = final_genres_dict.values()
values_list = list(values)
print(final_genres_dict)
# Horizontal bar chart of the selected top genres, coloured from a
# custom palette shared with the genres/ratings/frequency bubble plot.
x1 = list(keys)
y1 = list(values)
barlist = plt.barh(x1, y1)
colors = ["Crimson", "LightSalmon", "#FFFF00", "#663399", "Orange", "#90EE90", "#808000", "#1E90FF", "#0000FF", "#DAA520", "#228B22", "red", "#3CB371"]
# Colour the ten bars with palette entries 1..10 (skipping entry 0),
# exactly as the original's ten indexed set_color calls did.
for genre_bar, bar_color in zip(barlist, colors[1:11]):
    genre_bar.set_color(bar_color)
plt.title('Top 10 Book Genres')
plt.xlabel('Number of Books')
plt.savefig('top_10_genres.png', dpi=300)
plt.show()
# Distribution of book ratings with the mean marked.
sns.displot(books, x='book_rating',binwidth = 0.05, color = 'salmon')
plt.axvline(x= np.mean(books['book_rating']), color="blue", label="mean")
plt.title('Distribution of All Books\' Ratings' )
plt.legend(loc="upper left")
plt.xlim(3,5)  # zoom in on the range where ratings concentrate
plt.show()
# BOOK RATINGS VS GENRES SPANNED
# QUERY 2: Books ratings vs # genres spanned
# (do people like books that are about a variety of themes/topics or just a few?)
# Count '|'-separated genres per book, indexed by row position.
genre_span_list = np.zeros(len(books['genres']))
for x in books['genres'].index:
    num_genres = str(books['genres'][x]).count('|') + 1
    genre_span_list[x] = num_genres
# Frequency of each span value.
genre_span_dict = {}
for x in range(len(genre_span_list)):
    if (genre_span_list[x] not in genre_span_dict):
        genre_span_dict[genre_span_list[x]] = 1
    else:
        genre_span_dict[genre_span_list[x]] += 1
# Order buckets by span value for plotting.
ordered_genre_span_dict = collections.OrderedDict(sorted(genre_span_dict.items()))
x = ordered_genre_span_dict.keys()
y = ordered_genre_span_dict.values()
plt.figure(figsize=(6, 4))
plt.bar(x, y, color='salmon', edgecolor="black", width=1)
plt.title('Distribution of Number of Genres Spanned')
plt.xlabel('Number of Genres')
plt.ylabel('Number of Books')
plt.axvline(x= ave_genres_per_book/total_num_books, color="blue", label="mean")
plt.legend(loc="upper left")
plt.xticks(range(18))
plt.xlim(1,18)
# Bucket ratings into labelled quality bands for grouped plots
# (Poor, Bad, Decent, Good, Extremely Good).
books['rate_group'] = pd.cut(books['book_rating'],bins=[0,3,3.5,4,4.5,5], labels=['Poor (0-3)','Bad (3-3.5)','Decent (3.5-4)', 'Good (4-4.5)','Extremely Good (4.5-5)'])
books.head()
# Attach the per-book genre span computed above as a new column.
genre_span_arr = np.array(genre_span_list)
books = books.sort_index(axis=0,ascending=True)
books['genres_spanned'] = genre_span_arr
#books.head()
# Stacked bars: book count per genre-span value, split by rating band.
d = books.groupby(['genres_spanned', 'rate_group'])['genres'].size().unstack()
d.plot(kind='bar', stacked=True, title = 'Ratings of Books vs Number of Genres Spanned', colormap=my_cmap)
plt.xlabel('Number of Genres Spanned')
plt.ylabel('Number of Books')
plt.tight_layout()
plt.savefig('genre_span_rating.png', dpi=300)
# QUERY: BOOK FORMAT VS RATINGS
# Count books per format.
book_form = np.array(books['book_format'])
book_form_dict = {}
for x in book_form:
    if x not in book_form_dict:
        book_form_dict[x] = 1
    elif x in book_form_dict:
        book_form_dict[x] += 1
#print(book_form_dict)
# Rebuild the dict sorted by ascending count.
sorted_form_dict = {}
sorted_keys2 = sorted(book_form_dict, key=book_form_dict.get)
for k in sorted_keys2:
    sorted_form_dict[k] = book_form_dict[k]
#print(sorted_form_dict)
# NOTE(review): top_ten_formats grabs only the single most common format
# and is never used; final_form_dict is hard-coded with counts
# presumably copied from a previous run — confirm they match the data.
top_ten_formats = list(sorted_form_dict)[-1]
final_form_dict = {}
final_form_dict = {'ebook': 2534, 'Mass Market\nPaperback': 2668, 'Kindle Edition': 5436, 'Hardcover': 12163, 'Paperback': 28725}
keys2 = final_form_dict.keys()
values2 = final_form_dict.values()
x = list(keys2)
y = list(values2)
# Horizontal bars, one per format, coloured from the YlOrBr-like array.
barlist2= plt.barh(x, y,color='#f4811d')
color_arr = ['#fff4b6', '#feda7e', '#feb23f', '#f4811d', '#d55607', '#a03704']
barlist2[0].set_color(color_arr[1])
barlist2[1].set_color(color_arr[2])
barlist2[2].set_color(color_arr[3])
barlist2[3].set_color(color_arr[4])
barlist2[4].set_color(color_arr[5])
plt.title('Top 5 Book Formats')
plt.xlabel('Number of Books')
# NOTE(review): this loop only reads colours and discards them — dead code.
for i in range(my_cmap.N):
    rgba = my_cmap(i)
# BOOK RATINGS VS BOOK FORMAT
# NOTE(review): book_format_list is allocated but never written — dead.
book_format_list = np.zeros(len(books['book_format'])) # keys are
paperback_count = 0
hardcover_count = 0
kindle_count = 0
mmp_count = 0
ebook_count = 0
other_count = 0
book_form_list = []
#books = books.sort_index(axis=0,ascending=True)
#books['genres_spanned'] = genre_span_arr
# Encode each book's format as an integer code 1..5 (0 = any other format).
for x in books['book_format'].index:
    if books['book_format'][x]== 'Paperback':
        paperback_count+=1
        book_form_list.append(1)
    elif books['book_format'][x]== 'Hardcover':
        hardcover_count+=1
        book_form_list.append(2)
    elif books['book_format'][x]== 'Kindle Edition':
        kindle_count+=1
        book_form_list.append(3)
    elif books['book_format'][x]== 'Mass Market Paperback':
        mmp_count+=1
        book_form_list.append(4)
    elif books['book_format'][x]== 'ebook':
        ebook_count+=1
        book_form_list.append(5)
    else:
        book_form_list.append(0)
        other_count+=1
book_form_arr = np.array(book_form_list)
books['books_format'] = book_form_arr
#books.head()
# Stacked bars: book count per format code, split by rating band.
sss = books.groupby(['books_format', 'rate_group'])['genres'].size().unstack()
ax = sss.plot(kind='bar', stacked=True, title = 'Ratings of Books vs Top 5 Book Formats',colormap=my_cmap)
plt.legend(loc="upper right")
plt.tight_layout()
plt.xlim(0.5, 5.5)  # hide the 'other' (code 0) bucket
ax.set_xticks([1, 2, 3, 4, 5])
ax.set_xticklabels(['Paperback', 'Hardcover', 'Kindle Edition', 'Mass Market\n Paperback', 'ebook'], rotation=0)
ax.set_xlabel('Book Formats\n')
plt.savefig('book_format_vs_rating3.png', dpi=300)
# BOOK RATINGS VS NUMBER OF WORDS IN TITLE
# (the original header said "letters", but the code counts
# whitespace-separated words)
title_len_list = np.zeros(len(books['book_title']))
for x in books['book_title'].index:
    num_title_words = int(len(str(books['book_title'][x]).split()))
    title_len_list[x] = num_title_words
#print(title_len_list)
title_len_arr = np.array(title_len_list)
books['title_word_len'] = title_len_arr
#books.head()
# Stacked bars: book count per title word count, split by rating band.
ss = books.groupby(['title_word_len', 'rate_group'])['genres'].size().unstack()
ss.plot(kind='bar', stacked=True, title = 'Ratings of Books vs Number of Words in Title',colormap=my_cmap)
plt.axvline(x= np.mean(books['title_word_len']), color="navy", label="mean")
plt.legend(loc="upper right")
plt.xlim(-0.5, 14.5)
plt.xlabel('Number of Words in Book Title')
plt.savefig('book_word_len_vs_rating.png', dpi=300)
# Parse the page-count column (strings like 'NNN pages') to integers,
# for the correlation matrix.
# NOTE(review): rstrip strips a character *set*, not the literal suffix;
# it works here only because digits never appear in 'pages '.
book_pages = books['book_pages'].str.rstrip('pages ')
book_pages = book_pages.dropna()
book_pages = book_pages.astype(int, copy=True, errors='raise')
#book_pages = book_pages.sort_values(False)
#print(book_pages)
# Rows dropped above realign as NaN here, so the column becomes float.
books['book_pgs'] = book_pages
# Correlation Matrix over the numeric and engineered columns.
df_core = pd.concat([books['book_rating'], books['book_pgs'], books['book_rating_count'],books['book_review_count'],books['genres_spanned'], books['title_word_len'], books['books_format']], axis=1, keys=['book_rating', 'book_pages', 'rate_count', 'rev_count', 'genres_spanned', 'title_word_len','books_format'])
#print(df_core)
#df_core.corr()
corrMatrix2 = df_core.corr()
#print (corrMatrix2)
sns.heatmap(corrMatrix2, annot=True)
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300)
plt.show()
import pandas as pd #for importing csv file
import numpy as np #for sum mathematical stuff
import matplotlib.pyplot as plt #for plotting
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
import time
import string
import collections
from collections import Counter
from wordcloud import WordCloud
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Load the raw Goodreads dump, skipping malformed rows (e.g. extra commas).
# Fix: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# try the modern `on_bad_lines` first and fall back for old pandas.
try:
    books = pd.read_csv('book_data.csv', on_bad_lines='skip')
except TypeError:
    books = pd.read_csv('book_data.csv', error_bad_lines=False)
print("There are {} rows and {} columns in the dataset.".format(books.shape[0], books.shape[1]))
#books.shape #table dimensions
#columns
np.array(books.columns)
# Columns are 'book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn', 'book_pages', 'book_rating', 'book_rating_count', 'book_review_count', 'book_title', 'genres', 'image_url'.
# Below, the first five rows of the dataset are shown with the head method.
# First five rows of the raw dataset.
books.head()
#columns which contain null values and the number of null elements
null_counts = books.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False) #null_counts in each column (sorted)
#removing stopwords from the book description #future: use the initial form of words
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#stop_words
books[['book_title', 'genres']]
# Genres analysed in the word-cloud / classifier sections below.
desired_genres = ['Fiction', 'Classics', 'Sports', 'Romance'] #from another analysis in the following
#books['genres'].iloc[0], books['book_title'].iloc[0]
#books = books[books['book_title'].notna()] #removing nulls in book_title column
books = books[books['genres'].notna()] #removing nulls in genres column
s = time.time()
# One big string of titles per genre; a book tagged with several of the
# desired genres contributes to each of them.
fiction_string = ""
classics_string = ""
sports_string = ""
romance_string = ""
for row in range(len(books)):
    genre = books['genres'].iloc[row]
    padded_title = str(books['book_title'].iloc[row]) + " "
    if desired_genres[0] in genre:
        fiction_string += padded_title
    if desired_genres[1] in genre:
        classics_string += padded_title
    if desired_genres[2] in genre:
        sports_string += padded_title
    if desired_genres[3] in genre:
        romance_string += padded_title
print("Took {} seconds".format(time.time() -s ))
#function for removing non-english words
def isEnglish(s):
    """
    Return True if `s` contains only ASCII characters, False otherwise.

    (The original docstring claimed this "checks whether s is a string";
    string-ness is only enforced by the assert — the real test is whether
    the text survives an ASCII round-trip.)

    Parameters
    ----------
    s : str

    Raises
    ------
    AssertionError
        If `s` is not a str (kept from the original contract).
    """
    assert isinstance(s, str)
    try:
        # encode() on a str never fails for UTF-8; decode('ascii') raises
        # exactly when a non-ASCII character is present.
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
assert not isEnglish('کتاب')
# Peek at the first 100 characters of the concatenated fiction titles.
fiction_string[:100]
import collections
from collections import Counter
# Hash-based membership tests for stop words inside clean_words below.
stopwords_dict = Counter(stop_words)
def clean_words (string_g):
    """
    Clean a raw string and count its interesting words.

    Steps: strip punctuation, lowercase, then keep only tokens that are
    ASCII-only, not NLTK stop words, and longer than two characters.

    Parameters
    ----------
    string_g : str
        Raw text (e.g. all titles of a genre concatenated).

    Returns
    -------
    dict
        word -> occurrence count, in first-occurrence order.
    """
    # remove punctuation, then tokenise and lowercase
    cleaned = "".join(ch for ch in string_g if ch not in string.punctuation)
    tokens = [tok.lower() for tok in cleaned.split()]
    # The original tested membership in both `stopwords_dict` (a Counter
    # built from `stop_words`) and `stop_words` itself — the second check
    # was redundant, and the trailing `else: pass` was dead code.
    kept = [tok for tok in tokens
            if tok not in stopwords_dict and isEnglish(tok) and len(tok) > 2]
    # Counter preserves first-occurrence order, matching the original dict.
    return dict(Counter(kept))
#returning most frequent words of a dictionary
def most_freq_in_dictionary(diction, top):
    """
    Split a word-frequency dict into its most frequent entries.

    Parameters
    ----------
    diction : dict
        word -> count.
    top : int
        How many of the largest counts to keep (> 0).

    Returns
    -------
    tuple
        (sorted_diction, cutoff, wc) where `sorted_diction` is `diction`
        re-ordered by ascending count, `cutoff` is the list of the `top`
        largest counts (ascending), and `wc` holds every entry whose count
        appears in `cutoff` — ties with the smallest kept count are all
        included, so `wc` may hold more than `top` entries.
    """
    assert isinstance(diction, dict)
    assert isinstance(top, int)
    assert top > 0
    # Sort the items once; the original also re-sorted the already-sorted
    # values a second time.
    ordered = sorted(diction.items(), key=lambda kv: kv[1])
    sorted_diction = dict(ordered)
    counts = [v for _, v in ordered]
    cutoff = counts[-top:]
    wc = {k: v for k, v in ordered if v in cutoff}
    return sorted_diction, cutoff, wc
# Top-50 word frequency tables for each genre's concatenated titles.
word_freq = clean_words(fiction_string)
d_fiction, v_fiction, wc_fiction = most_freq_in_dictionary(word_freq, 50)
word_freq = clean_words(classics_string)
d_classics, v_classics, wc_classics = most_freq_in_dictionary(word_freq, 50)
word_freq = clean_words(sports_string)
d_sports, v_sports, wc_sports = most_freq_in_dictionary(word_freq, 50)
word_freq = clean_words(romance_string)
d_romance, v_romance, wc_romance = most_freq_in_dictionary(word_freq, 50)
#!conda install -c conda-forge wordcloud=1.6.0
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# NOTE: this shadows the nltk `stopwords` module imported earlier.
stopwords = set(STOPWORDS)
def show_wordcloud(data, title = None):
    """
    Render a string as a word cloud and display it, with an optional title.
    """
    assert isinstance(data, str)
    cloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=10,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))
    figure = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        figure.suptitle(title, fontsize=20)
        figure.subplots_adjust(top=2.3)
    plt.imshow(cloud)
    plt.show()
def plot_wc(diction, filename):
    """
    Draw a word cloud from a {word: frequency} dict inside the book-shaped
    mask image and save it as "<filename>.png".
    """
    assert isinstance(diction, dict)
    assert isinstance(filename, str)
    mask_img = np.array(Image.open("book.png"))
    cloud = WordCloud(
        background_color="white",
        colormap='RdBu',
        collocations=False,
        mask=mask_img,
        contour_width=1,
        contour_color='black',
        width=1200,
        height=1000,
        max_font_size=80,
        scale=3,
    ).generate_from_frequencies(diction)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig("{}.png".format(filename))
    plt.show()
#source: https://medium.com/swlh/masking-with-wordcloud-in-python-500-most-frequently-used-words-in-german-c0e865e911bb
# Masked word clouds of the frequent title words, one file per genre.
plot_wc(wc_sports, "sport_wc")
plot_wc(wc_fiction, "fiction_wc")
plot_wc(wc_classics, "classics_wc")
plot_wc(wc_romance, "romance_wc")
# Non-empty genre strings, one entry per book.
g= [] #genres
for i in range(books.shape[0]):
    genre_val = books['genres'].iloc[i]
    if isinstance(genre_val, str) and len(genre_val) > 0:
        g.append(genre_val)
# Map each individual genre tag to the list of ratings of books carrying it.
# Fix: the original if/else had identical append statements in both
# branches; setdefault collapses the duplication with the same behavior.
Genres_dict = {}
for i in range(len(books)):
    rating = books.iloc[i]['book_rating']
    for genre in str(books.iloc[i].genres).split('|'):
        Genres_dict.setdefault(str(genre), []).append(rating)
#len(Genres_list)
#figuring out all the existing genres and saving the genres in Genres_list list
Genres_dict_count = {}
for i in range(len(books)):
row = books.iloc[i]
Gen = str(row.genres)
genres = Gen.split('|')
for genre in genres:
if genre not in Genres_dict_count:
Genres_dict_count[str(genre)] = 1
else:
Genres_dict_count[str(genre)] +=1
#len(Genres_list)
# Average rating per genre. The original guarded each insert with
# `if k not in genres_average_rating`, but the dict starts empty and each
# key from Genres_dict is unique, so the check was a no-op.
genres_average_rating = {k: np.mean(v) for k, v in Genres_dict.items()}
#also can be derived from our function in part 1
# Hard-coded book counts for the 12 most common genre tags.
# NOTE(review): these numbers were presumably produced by an earlier run —
# verify they still match Genres_dict_count.
most_frequent_gens ={'Contemporary': 6039,
'Classics': 6272,
'Historical Fiction': 6399,
'Science Fiction': 6780,
'Nonfiction': 7598,
'Mystery': 7902,
'Paranormal': 7994,
'Historical': 10789,
'Young Adult': 11251,
'Romance': 18636,
'Fantasy': 23583,
'Fiction': 25736}
frequent_gs = list(most_frequent_gens.keys())
frequent_counts = list(most_frequent_gens.values())
# Average rating (computed above) for each of the frequent genres.
frequent_rates = [genres_average_rating[i] for i in frequent_gs]
#### import random
# Random hex palette (superseded by the hand-picked list right below).
number_of_colors =len(frequent_gs) #len
r = ["#"+''.join([random.choice('0123456789ABCDFE') for j in range(6)])
for i in range(number_of_colors)]
# Fixed, readable colours actually used for the genre bubble chart.
random_colors = ["Crimson", "pink", "#FFFF00", "#663399","Orange", "#90EE90", "#808000",
"#1E90FF", "#0000FF", "#DAA520", "#228B22", "red", "#3CB371"]
plt.figure(figsize=(12, 9))
# One scatter call per genre so each gets its own legend entry; marker area
# is proportional to the genre's book count.
for idx, (cnt, rate) in enumerate(zip(frequent_counts, frequent_rates)):
    plt.scatter(cnt, rate,
                c=random_colors[idx],
                alpha=0.5,
                s=cnt, label=frequent_gs[idx])
# Annotate each bubble with its rounded average rating.
for cnt, rate in zip(frequent_counts, frequent_rates):
    plt.text(cnt - 300, rate, s=str(np.round(rate, 2)), c="navy", size=10)
plt.rcParams["legend.markerscale"] = 0.1
plt.legend(markerscale=0.07, bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
plt.xlabel("Frequency of genres", size=14)
plt.ylabel("Average rating", size=14)
plt.title("Most frequent genres average ratings vs frequency", size=16)
plt.savefig("freq_genres_ratings_2.png")
# Titles of every book (co-)authored by Stephen King.
s = [
    books.iloc[i]["book_title"]
    for i in range(len(books))
    if "Stephen King" in books.iloc[i]["book_authors"]
]
titles_genres = books[['book_title', 'genres']] #picking just title and genres
# Recompute the top-word tables with larger cutoffs to form the classifier
# vocabulary for the romance / sports / classics experiment below.
word_freq = clean_words(romance_string)
d_romance, v_romance, wc_romance = most_freq_in_dictionary(word_freq, 150)
word_freq = clean_words(sports_string)
d_sports, v_sports, wc_sports = most_freq_in_dictionary(word_freq, 100)
word_freq = clean_words(classics_string)
d_classics, v_classics, wc_classics = most_freq_in_dictionary(word_freq, 150)
# Vocabulary = de-duplicated union of the three genres' frequent words.
r = list(wc_romance.keys())
s = list(wc_sports.keys())
c = list(wc_classics.keys())
word_vectors_rsc = r+s+c
word_vectors_rsc = list(np.unique(word_vectors_rsc))
books = books[books['genres'].notna()]
# Per-genre book counts and row positions for Romance / Sport / Classics.
# ('Sport' is a substring match, so it also catches 'Sports'.)
numbers = [0] * 3
r_ind = []
s_ind = []
c_ind = []
for row in range(len(books)):
    tags = books.genres.iloc[row]
    if 'Romance' in tags:
        numbers[0] += 1
        r_ind.append(row)
    if 'Sport' in tags:
        numbers[1] += 1
        s_ind.append(row)
    if 'Classics' in tags:
        numbers[2] += 1
        c_ind.append(row)
# Flatten the three index lists into one (order: romance, sport, classics).
indices = [pos for group in [r_ind, s_ind, c_ind] for pos in group]
# Sanity check: clean the first book title.
cleaned_title = []
cleaned_title.append(clean_words(books.iloc[0]['book_title']))
cleaned_title
# One row per selected book, one column per vocabulary word (one-hot matrix).
word_vec = np.zeros((len(indices), len(word_vectors_rsc)))
word_vec.shape
#converting titles to word vectors
# One-hot encode each selected title over the shared vocabulary:
# word_vec[row][col] = 1 iff vocabulary word `col` occurs in the title.
# (`word in cleaned_title` replaces the redundant `.keys()` lookup.)
for row, index in enumerate(indices):
    cleaned_title = clean_words(books.iloc[index]['book_title'])
    for col, word in enumerate(word_vectors_rsc):
        if word in cleaned_title:
            word_vec[row][col] = 1
# Class labels aligned with `indices`: 0 = romance, 1 = sport, 2 = classics.
y = [0] * len(r_ind) + [1] * len(s_ind) + [2] * len(c_ind)
y.count(0), y.count(1), y.count(2), numbers
#ran on server
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = word_vec
y = y
iterations = 10 #ran 10 times in the main file
t = []
acc = []
# Repeated random hold-out (70/30) evaluation of a scaled SVM on the
# one-hot title vectors; collect accuracy and wall-clock time per run.
for run in range(iterations):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time() - s)
# Fix: the original repeated the same train/evaluate cell five times
# verbatim (a notebook-cell copy-paste). A loop preserves the exact
# behavior — fresh `t`/`acc`, then five independent random splits — without
# the duplication.
X = word_vec
y = y
t = []
acc = []
for _ in range(5):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time() - s)
acc, t
np.min(acc), np.mean(acc), np.max(acc)
# Keep the SVM results for the comparison chart below.
SVM_acc = acc
SVM_time = t
# Single random-forest hold-out run (same 70/30 protocol as the SVM).
s = time.time()
X = word_vec
y = y
acc = []
t = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#print(len(X_train), len(X_test))
clf = RandomForestClassifier(max_depth=20)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
#print("accuracy is ", accuracy_score(y_test, predicted))
#print("took {} second".format(time.time()-s))
acc.append(accuracy_score(y_test, predicted))
t.append(time.time()-s)
acc, t
#ran on server
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = word_vec
y = y
iterations = 10
t = []
acc = []
# Same repeated hold-out protocol as the SVM, with a random forest.
for run in range(iterations):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    clf = RandomForestClassifier(max_depth=20)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time() - s)
# Keep the random-forest results for the comparison chart below.
RF_acc = acc
RF_t = t
RF_acc
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
#ran on server
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
X = word_vec
y = y
iterations = 10
t = []
acc = []
# k-nearest-neighbours under the same repeated hold-out protocol.
for run in range(iterations):
    s = time.time()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X_train, y_train)
    predicted = neigh.predict(X_test)
    acc.append(accuracy_score(y_test, predicted))
    t.append(time.time() - s)
KNN_acc = acc
KNN_t = t
# min / mean / max accuracy summary per classifier.
print(np.min(SVM_acc), np.mean(SVM_acc), np.max(SVM_acc))
print(np.min(KNN_acc), np.mean(KNN_acc), np.max(KNN_acc))
print(np.min(RF_acc), np.mean(RF_acc), np.max(RF_acc))
import numpy as np
import matplotlib.pyplot as plt
# Grouped bar chart: min / mean / max accuracy for SVM, RF and KNN.
data = [[np.min(SVM_acc), np.mean(SVM_acc), np.max(SVM_acc)],
        [np.min(RF_acc), np.mean(RF_acc), np.max(RF_acc)],
        [np.min(KNN_acc), np.mean(KNN_acc), np.max(KNN_acc)]]
X = np.arange(3)
fig = plt.figure()
plt.ylabel("Prediction accuracy")
plt.ylim(0, 1)
# Fix: the original passed six tick positions ([0..5]) with three labels,
# which modern matplotlib rejects ("number of FixedLocator locations ...
# does not match the number of labels"). Put one tick under the middle bar
# of each three-bar group instead.
squad = ['Minimum', 'Average', 'Maximum']
plt.xticks(X + 0.25, squad)
plt.title("Genre (Romance | Sports | Classics) Prediction Accuracy", size=12)
plt.bar(X, data[0], color = 'gold', width = 0.25, label = 'SVM')
plt.bar(X + 0.25, data[1], color = 'navy', width = 0.25, label = 'RF')
plt.bar(X + 0.50, data[2], color = 'LightGreen', width = 0.25, label = 'KNN')
plt.legend(loc=2)
plt.savefig("prediction.png")
# Pool every rating across all genres into a single flat list.
all_ratings = []
for ratings in Genres_dict.values():
    all_ratings.extend(ratings)
# Per-genre rating lists for the boxplot below.
fiction_ratings = Genres_dict['Fiction']
fantasy_ratings = Genres_dict['Fantasy']
romance_ratings = Genres_dict['Romance']
sports_ratings = Genres_dict['Sports']
historical_ratings = Genres_dict['Historical']
mystry_ratings = Genres_dict['Mystery']
ys = [fiction_ratings, fantasy_ratings, romance_ratings, sports_ratings, historical_ratings, mystry_ratings, all_ratings]
# Transpose so each genre becomes a column (ragged lists pad with NaN).
data = pd.DataFrame(ys).T
data.columns = ['Fiction', 'Fantasy', 'Romance', 'Sports', 'Historical', 'Mystery', "all"]
data.head()
import matplotlib.pyplot as plt
import numpy as np
# Notched boxplots of the rating distribution per genre, plus all genres.
plt.figure(figsize=(12, 6))
data = ys #[np.random.normal(0, std, 1000) for std in range(1, 6)]
plt.ylim(0,5)
box = plt.boxplot(data, notch=True, patch_artist=True)
colors_ = ["#ef8783", "#9bc295", "#e8d197", "#90EE90", "#97c1f9", "#d1f4cb", "orange"]
plt.xticks(np.arange(8), ['', 'Fiction', 'Fantasy', 'Romance', 'Sports', 'Historical', 'Mystry', 'All genres'], rotation=0)
plt.xlabel("Genres", size=12)
plt.ylabel("Rating Dirstribution", size=12)
plt.title("Distribution of Ratings for different genres", size=15)
# Fix: the original called plt.subplots() here — which opened a brand-new,
# empty figure — and then ax.set_xticklabels(labels) with an undefined
# `labels` name (NameError). The tick labels are already set above, so we
# only need to colour the boxes on the current figure.
for patch, color in zip(box['boxes'], colors_):
    patch.set_facecolor(color)
plt.savefig("ratings_dist.png")
plt.show()
# Second dataset joined to ours on the book title.
# Fix: `error_bad_lines` was removed in pandas 2.0 — prefer `on_bad_lines`.
try:
    goodreads = pd.read_csv('books.csv', on_bad_lines='skip')
except TypeError:
    goodreads = pd.read_csv('books.csv', error_bad_lines=False)
# Rename our columns so both frames share a 'title' key.
books.columns = ['book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn',
                 'book_pages', 'book_rating', 'book_rating_count', 'book_review_count',
                 'title', 'genres', 'image_url']
merged = pd.merge(books, goodreads, on='title')
merged.columns
# Last five merged rows with readable column names.
x = merged[["title", "book_authors", "genres", "book_format", "book_rating", "book_rating_count", "book_review_count", "publication_date", "book_pages"]].tail()
x.columns = ["title", "author(s)", "genres", "format", "average_rating", "rating_count", "review_count", "publication_date", "num_pages"]
x
authors_freq = {}
all_authors = []
auth_ratings ={}
# Per-author book counts plus the flat list of all author occurrences.
for i in range(len(books)):
    for auth in books['book_authors'].iloc[i].split("|"):
        all_authors.append(auth)
        authors_freq[auth] = authors_freq.get(auth, 0) + 1
# Per-author rating lists. Fix: the original needed two full passes over
# `books` — one just to create empty lists, one to fill them; setdefault
# does both in a single pass with identical results.
for i in range(len(books)):
    rating = books.iloc[i]['book_rating']
    for auth in books['book_authors'].iloc[i].split("|"):
        auth_ratings.setdefault(auth, []).append(rating)
# Average rating per author.
auth_ratings_av = {}
for k, v in auth_ratings.items():
    auth_ratings_av[k] = np.mean(v)
freqs = list(authors_freq.values())
def find_author_with_number_of_works(i, d):
    """
    Return the sub-dict of `d` whose values equal `i`.

    Parameters
    ----------
    i : int
        Number of works to match (must be >= 0).
    d : dict
        Mapping author name -> number of works.

    Returns
    -------
    dict
        Entries of `d` with exactly `i` works (possibly empty).

    Raises
    ------
    AssertionError
        If `i` is negative (kept from the original contract).
    """
    assert i >= 0
    return {k: v for k, v in d.items() if v == i}
len(find_author_with_number_of_works(1, authors_freq))
len(find_author_with_number_of_works(2, authors_freq))
def _best_rated_author(work_count):
    """Return (name, avg rating) of the highest-average-rated author among
    those with exactly `work_count` books; ("", 0) if there are none.
    On ties the first-encountered author wins, as in the original loops."""
    candidates = find_author_with_number_of_works(work_count, authors_freq)
    best_rate = 0
    best_author = ""
    for name in candidates:
        if auth_ratings_av[name] > best_rate:
            best_rate = auth_ratings_av[name]
            best_author = name
    return best_author, best_rate
# Fix: the max-search loop was written out twice verbatim (once for i=1,
# once inside the loop over all counts); both now share _best_rated_author.
i = 1
chosen_author, max_rate = _best_rated_author(i)
print(chosen_author, max_rate, i)
result = [] #authors chosen: (name, rounded avg rating, #books) per distinct count
for i in sorted(np.unique(freqs)):
    author, rate = _best_rated_author(i)
    result.append((author, np.round(rate, 2), i))
#generating random colors
import matplotlib.pyplot as plt
import random
# One random hex colour per author-frequency entry.
# NOTE(review): this palette does not appear to be used by the plots below —
# confirm before removing.
number_of_colors =len(freqs) #len
random_colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
for i in range(number_of_colors)]
from colorsys import hls_to_rgb
def rainbow_color_stops(n=10, end=1):
    """
    Return `n` RGB tuples with the hue swept evenly from 0 to `end`
    (fixed lightness 0.2, saturation 1).

    NOTE: requires n >= 2 — the hue step divides by (n - 1), exactly as the
    original expression did, so n == 1 raises ZeroDivisionError.
    """
    stops = []
    for idx in range(n):
        stops.append(hls_to_rgb(end * idx / (n - 1), 0.2, 1))
    return stops
# One rainbow colour per author for the big scatter below.
cs = rainbow_color_stops(len(freqs))
# Cumulative number of authors with <= i books.
# Fix: the original bound was len(np.unique(freqs) + 2) — the "+ 2" sat
# inside the len() and added 2 to every array element, never changing the
# length, so it was a silent no-op. The loop below is behaviorally
# identical with the intent made explicit.
# NOTE(review): freq_counts[0] is left at 0 (authors with exactly 1 book
# accumulate at index 1) — confirm this offset is intended.
freq_counts = np.zeros(len(np.unique(freqs)))
for i in range(1, len(freq_counts)):
    freq_counts[i] = freq_counts[i - 1] + freqs.count(i)
# libraries
import matplotlib.pyplot as plt
import numpy as np
# Bubble chart of authors' book counts plus annotations of the best-rated
# author per book count.
x = np.arange(len(freqs))
y = np.random.rand(len(freqs))  # random vertical jitter, purely cosmetic
z = sorted(freqs)               # marker sizes: book counts, ascending
fig=plt.figure(figsize=(20, 15))
# Fix: the original coloured with `second_plot_colors`, a name that is never
# defined anywhere in this file (NameError at runtime). Use the rainbow
# palette `cs`, which holds exactly one colour per author.
plt.scatter(x, y, s=z, c=cs, alpha=0.5)
plt.xlim(0, len(freqs)+10000)
plt.title("Best authors based on frequency (name, average rate, #books) | Distribution of author's number of books", size=18)
plt.xlabel("Number of authors", size=15)
# Annotate every other (best author, rating, #books) triple down the figure.
# Fix: the original if i>10 / else branches were character-for-character
# identical, so the branch is collapsed.
for i in range(1, len(freq_counts)+1, 2):
    entry = result[i-1]
    label = entry[0] + " " + str(entry[1]) + " " + str(entry[2])
    plt.text(30000, 1 - i*0.013, str(label), size=14, color='black')
plt.savefig("best_authors.png")
plt.scatter(x, y, s=z, alpha=0.5)
#to show distribution
fig=plt.figure(figsize=(20, 10))
plt.xlabel("Number of books", size=15)
plt.ylabel("Maximum number of authors", size=15)
plt.title("Distribution of Authors based on their number of books")
# One vertical strip of points per distinct book count: as many points as
# authors with that count, with random vertical jitter. Small markers for
# the dense mid-range counts, larger elsewhere.
for i in np.unique(freqs):
    x = [i]*(freqs.count(i))
    y = list(np.random.randint(low=0, high=freqs.count(1), size=(freqs.count(i))))
    plt.scatter(x, y, s=1 if 1 < i < 15 else 5)
plt.savefig("Authors_number_of_books.png")
# libraries
import matplotlib.pyplot as plt
import numpy as np
colors = ["Crimson", "LightSalmon", "#FFFF00", "#663399","Orange", "#90EE90", "#808000",
"#1E90FF", "#0000FF", "#DAA520", "#228B22", "red", "#3CB371"]
# create dataset
# Each frequent genre's share of the whole dataset, in percent.
height = [i/len(books)*100 for i in list(most_frequent_gens.values())]
bars = list(most_frequent_gens.keys())
y_pos = np.arange(len(bars))
# Create horizontal bars
plt.barh(y_pos, height,color=colors)
plt.xlabel("Percentage")
plt.ylabel("Genre")
plt.title("Percentage of Each Genre in the Total dataset")
# Create names on the x-axis
plt.yticks(y_pos, bars)
# Show graphic
#plt.show()
plt.savefig("percentage_genre.png")
import matplotlib.pyplot as plt
import seaborn as sns
z = all_ratings
# (A batch of per-genre distplot overlays was left disabled as a dead
# triple-quoted string in the original; only the all-genre KDE is kept.)
sns.kdeplot(all_ratings, color="#15EF30", shade=True, label="All genres")
plt.legend()
# Fix: the original saved the figure BEFORE calling plt.legend(), so the
# saved PNG had no legend, and it wrote to an absolute per-user path
# ("/Users/fatemeh/..."). Save after the legend, next to the other outputs.
plt.savefig("rating_distribution_12.png")
plt.show()
import matplotlib.pyplot as plt
# Donut chart of how many books authors have, bucketed.
labels = ["One", "Two", "Three", "Four", "5-20", ">20"]
# NOTE(review): 3539 and 274+43 look like precomputed bucket totals from an
# earlier run — confirm they still match `freqs`.
sizes = [float(freqs.count(1)), freqs.count(2), freqs.count(3), freqs.count(4), 3539-freqs.count(4), 274+43]
#colors
colors = ['#593483','#E7DA57', '#5A3614', '#6CE2AC','salmon','#0015BC']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, colors = colors, labels=labels, autopct='%1.1f%%', startangle=45)
#draw circle
# A white circle over the pie centre turns it into a donut.
centre_circle = plt.Circle((0,0),0.75,fc='white')
#fig = plt.figure(figsize=(10, 10))
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
plt.title("Frequency of Authors' number of books")
plt.tight_layout()
#plt.show()
plt.savefig("circle.png")
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
# Reload the raw dataset for the final genre word cloud.
# Fix: `error_bad_lines` was removed in pandas 2.0 — prefer `on_bad_lines`.
try:
    books = pd.read_csv('book_data.csv', on_bad_lines='skip')
except TypeError:
    books = pd.read_csv('book_data.csv', error_bad_lines=False)
# Non-empty genre strings, one entry per book.
g= []
for i in range(books.shape[0]):
    genre_val = books['genres'].iloc[i]
    if isinstance(genre_val, str) and len(genre_val) > 0:
        g.append(genre_val)
# genre tag -> number of books carrying it (this intentionally rebinds
# Genres_dict from ratings-lists to counts for the word cloud below).
# Fix: the duplicated if/else branches collapse into dict.get counting.
Genres_dict = {}
for i in range(len(books)):
    for genre in str(books.iloc[i].genres).split('|'):
        Genres_dict[str(genre)] = Genres_dict.get(str(genre), 0) + 1
#word_could_dict=Counter(g)
# Genre word cloud shaped by the book-image mask, word size proportional to
# the genre's book count.
custom_mask = np.array(Image.open("book.png"))
wordcloud = WordCloud(background_color="white", collocations=False, mask=custom_mask, contour_width=1, contour_color='gray').generate_from_frequencies(Genres_dict)
#wc = WordCloud(background_color="white", mask=custom_mask)
#wc = WordCloud(background_color="white", collocations=False, mask=custom_mask, contour_width=1, contour_color='gray')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig("wc143.png")
plt.show()